University of San Francisco
We’re making a trade-off as marketers between:
Criteria
Approaches
Steps
Data sources
iid spend_online spend_retail age
Min. : 14 Min. : 0.00 Min. : 0.00 Min. :18.00
1st Qu.: 2946 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:33.00
Median : 5430 Median : 14.97 Median : 27.71 Median :41.00
Mean : 5463 Mean : 72.44 Mean : 78.00 Mean :40.91
3rd Qu.: 8110 3rd Qu.: 70.72 3rd Qu.: 78.00 3rd Qu.:49.00
Max. :10589 Max. :1985.75 Max. :2421.91 Max. :88.00
white college male hh_inc
Min. :0.0000 Min. :0.0000 Min. :0.000 Min. : 2.499
1st Qu.:0.7297 1st Qu.:0.3835 1st Qu.:0.000 1st Qu.: 59.356
Median :0.8550 Median :0.5580 Median :0.000 Median : 87.364
Mean :0.7993 Mean :0.5437 Mean :0.091 Mean : 96.254
3rd Qu.:0.9422 3rd Qu.:0.7136 3rd Qu.:0.000 3rd Qu.:122.602
Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :250.001
Data pre-processing
Data pre-processing
Data pre-processing
Data pre-processing
Is 199 more similar to 1163 or 9594?
What is the Euclidean distance between customers 199 and 1163 on spend_online, hh_inc?
Standardize first
# Pull the two customer rows once, then compare on the standardized columns.
cust_a <- DF[DF$iid == 199, ]
cust_b <- DF[DF$iid == 1163, ]
# Squared standardized differences on each dimension
spend_online_dif <- (cust_a$spend_online_standardized - cust_b$spend_online_standardized)^2
hh_inc_dif <- (cust_a$hh_inc_standardized - cust_b$hh_inc_standardized)^2
# Euclidean distance between customers 199 and 1163 in (spend_online, hh_inc) space
ed_199_1163 <- sqrt(spend_online_dif + hh_inc_dif)
ed_199_1163
[1] 0.3264905
What is the Euclidean distance between customers 199 and 9594 on spend_online, hh_inc?
# Same two-dimension Euclidean distance, now between customers 199 and 9594.
cust_a <- DF[DF$iid == 199, ]
cust_b <- DF[DF$iid == 9594, ]
# Squared standardized differences on each dimension
spend_online_dif <- (cust_a$spend_online_standardized - cust_b$spend_online_standardized)^2
hh_inc_dif <- (cust_a$hh_inc_standardized - cust_b$hh_inc_standardized)^2
# Euclidean distance between customers 199 and 9594 in (spend_online, hh_inc) space
ed_199_9594 <- sqrt(spend_online_dif + hh_inc_dif)
ed_199_9594
[1] 1.273762
R implementation of Euclidean & Gower distance
daisy() function in cluster package
daisy(DF, metric = "euclidean", warnType = FALSE, stand = TRUE)
DF: dataframe with all continuous variables; warnType=FALSE to silence warnings; stand=TRUE to standardize variables
daisy(DF, metric = "gower", warnType = FALSE)
DF: dataframe with continuous/binary variables; warnType=FALSE to silence warnings
R implementation of Euclidean & Gower distance
daisy() function in cluster package
R implementation of Euclidean & Gower distance
daisy() function in cluster package
Two main branches of clustering algorithms
K-means clustering algorithm
K-means clustering algorithm
K-means clustering algorithm
K-means clustering algorithm
# Distance from every observation to each of the three centroids.
# Here `centroids` holds row indices into df (the initial random picks).
for (k in 1:3) {
  df[[paste0("distance_", k)]] <-
    sqrt((df$x - df[centroids[k], ]$x)^2 + (df$y - df[centroids[k], ]$y)^2)
}
# Assign each row to its strictly nearest centroid; an exact tie leaves NA.
df$cluster <- ifelse(df$distance_1 < df$distance_2 & df$distance_1 < df$distance_3, 1,
  ifelse(df$distance_2 < df$distance_1 & df$distance_2 < df$distance_3, 2,
    ifelse(df$distance_3 < df$distance_1 & df$distance_3 < df$distance_2, 3, NA_real_)))
K-means clustering algorithm
# Recompute distances to the updated centroids.
# `centroids` is now a data frame with one row per centroid (columns x, y).
for (k in 1:3) {
  df[[paste0("distance_", k)]] <-
    sqrt((df$x - centroids[k, ]$x)^2 + (df$y - centroids[k, ]$y)^2)
}
# Reassign each row to its strictly nearest centroid; an exact tie leaves NA.
df$cluster_2 <- ifelse(df$distance_1 < df$distance_2 & df$distance_1 < df$distance_3, 1,
  ifelse(df$distance_2 < df$distance_1 & df$distance_2 < df$distance_3, 2,
    ifelse(df$distance_3 < df$distance_1 & df$distance_3 < df$distance_2, 3, NA_real_)))
sum(df$cluster != df$cluster_2)
[1] 8
K-means clustering algorithm
# Third iteration: distances to the freshly updated centroids
# (centroids is a data frame with one row per centroid, columns x and y).
for (k in 1:3) {
  df[[paste0("distance_", k)]] <-
    sqrt((df$x - centroids[k, ]$x)^2 + (df$y - centroids[k, ]$y)^2)
}
# Strictly nearest centroid wins; an exact tie leaves the row NA.
df$cluster_3 <- ifelse(df$distance_1 < df$distance_2 & df$distance_1 < df$distance_3, 1,
  ifelse(df$distance_2 < df$distance_1 & df$distance_2 < df$distance_3, 2,
    ifelse(df$distance_3 < df$distance_1 & df$distance_3 < df$distance_2, 3, NA_real_)))
sum(df$cluster_2 != df$cluster_3)
[1] 10
K-means clustering algorithm
# Fourth iteration: distances to the current centroids
# (centroids is a data frame with one row per centroid, columns x and y).
for (k in 1:3) {
  df[[paste0("distance_", k)]] <-
    sqrt((df$x - centroids[k, ]$x)^2 + (df$y - centroids[k, ]$y)^2)
}
# Strictly nearest centroid wins; an exact tie leaves the row NA.
df$cluster_4 <- ifelse(df$distance_1 < df$distance_2 & df$distance_1 < df$distance_3, 1,
  ifelse(df$distance_2 < df$distance_1 & df$distance_2 < df$distance_3, 2,
    ifelse(df$distance_3 < df$distance_1 & df$distance_3 < df$distance_2, 3, NA_real_)))
sum(df$cluster_4 != df$cluster_3)
[1] 5
Once we have the distance (similarity) matrix from daisy():
kmeans() to perform the cluster analysis
nstart option to multi-start algorithm from multiple random points
DF$clu_gower_4 <- kmeans(DF_gower, centers = 4, nstart = 10)$cluster
DF$clu_euclid_4 <- kmeans(DF_euclidean, centers = 4, nstart = 10)$cluster
In total:
You can also do this in one step, but I wouldn’t recommend it
Elbow plot example from df
What was the within-cluster variation from the example I had?
Elbow plot example from df
# Euclidean distance
# Pairwise dissimilarities on the standardized columns of df
# (daisy() is from the cluster package; warnType = FALSE silences
# the variable-type warnings, stand = TRUE standardizes first).
df_euclidean <- daisy(df, metric = "euclidean", warnType = FALSE, stand = TRUE)
# max number of clusters to test
max_clusters <- 10
# Preallocate the within-cluster sum-of-squares, one slot per candidate k
wss <- rep(0, max_clusters)
# Fit k-means for each candidate number of clusters.
# seq_len() is the safe idiom (1:max_clusters misbehaves if the bound were 0).
# NOTE(review): kmeans() coerces the dissimilarity object with as.matrix(),
# so it clusters rows of the distance matrix; results depend on the RNG —
# call set.seed() beforehand for reproducible elbow values.
for (i in seq_len(max_clusters)) {
  segments <- kmeans(df_euclidean, centers = i, nstart = 10)
  wss[i] <- sum(segments$withinss) # within-cluster sum-of-squares, summed over clusters
}
as.data.frame(wss)
wss
1 402.138498
2 149.588725
3 26.886629
4 21.013860
5 16.578999
6 14.285304
7 12.939867
8 11.182604
9 10.316321
10 9.181797
Elbow plot example from df
Elbow plot example from DF
# Euclidean distance
# Pairwise dissimilarities on the standardized columns of DF
# (daisy() is from the cluster package; warnType = FALSE silences
# the variable-type warnings, stand = TRUE standardizes first).
DF_euclidean <- daisy(DF, metric = "euclidean", warnType = FALSE, stand = TRUE)
# max number of clusters to test
max_clusters <- 10
# Preallocate the within-cluster sum-of-squares, one slot per candidate k
wss <- rep(0, max_clusters)
# Fit k-means for each candidate number of clusters.
# seq_len() is the safe idiom (1:max_clusters misbehaves if the bound were 0).
# NOTE(review): kmeans() coerces the dissimilarity object with as.matrix(),
# so it clusters rows of the distance matrix; results depend on the RNG —
# call set.seed() beforehand for reproducible elbow values.
for (i in seq_len(max_clusters)) {
  segments <- kmeans(DF_euclidean, centers = i, nstart = 10)
  wss[i] <- sum(segments$withinss) # within-cluster sum-of-squares, summed over clusters
}
as.data.frame(wss)
wss
1 5441463.4
2 3182491.3
3 1862694.3
4 1550044.8
5 1310505.9
6 1232772.6
7 1194428.1
8 786449.4
9 747327.0
10 720655.1
Elbow plot example from DF
3 or 4 is best
A simple way (among many) to compute sizes:
# Final segmentation: k = 4 clusters, 10 random restarts to avoid a poor
# local optimum. NOTE(review): no set.seed() visible, so labels may vary run to run.
segments <- kmeans(DF_euclidean, centers = 4, nstart=10)
# Attach each customer's cluster label back onto the data frame
DF$cluster <- segments$cluster
library(dplyr)
# Size and share of each cluster (proportions sum to 1)
DF |>
group_by(cluster) |>
summarise(size = n(),
proportion = round(n()/nrow(DF), 3))# A tibble: 4 × 3
cluster size proportion
<int> <int> <dbl>
1 1 163 0.163
2 2 386 0.386
3 3 18 0.018
4 4 433 0.433
A simple way (among many) to compute means:
# Profile the segments: mean of each listed variable within each cluster,
# rounded to 3 decimals (round() applies elementwise to the summary tibble).
DF |>
group_by(cluster) |>
summarise(across(c(spend_online, spend_retail, age, white, college, male, hh_inc), mean)) |>
round(3)# A tibble: 4 × 8
cluster spend_online spend_retail age white college male hh_inc
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 137. 136. 42.8 0.711 0.524 0.552 102.
2 2 31.1 50.0 43.3 0.878 0.695 0 126.
3 3 777. 841. 44.2 0.838 0.548 0.056 117.
4 4 55.7 49.2 37.9 0.761 0.416 0 67.0